/*************************************************************************
 * The contents of this file are subject to the MYRICOM MYRINET          *
 * EXPRESS (MX) NETWORKING SOFTWARE AND DOCUMENTATION LICENSE (the       *
 * "License"); User may not use this file except in compliance with the  *
 * License.  The full text of the License can found in LICENSE.TXT       *
 *                                                                       *
 * Software distributed under the License is distributed on an "AS IS"   *
 * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied.  See  *
 * the License for the specific language governing rights and            *
 * limitations under the License.                                        *
 *                                                                       *
 * Copyright 2003 - 2004 by Myricom, Inc.  All rights reserved.          *
 *************************************************************************/

/*
 * Transmit packets from every host to every host randomly
 */
#include <assert.h>
#include <errno.h>
#include <limits.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#include "mpi.h"

/* Defaults used when -n (message count) / -l (message length) are not given. */
#define DFLT_MSGCNT 10
#define DFLT_MSGLEN 4096

/*
 * Upper bound, in bytes, on the total size of the posted receive buffers.
 * Parenthesized so the expansion stays a single operand next to operators
 * such as '/', '%' or a cast.
 */
#define MAX_RECVBUF_BYTES (250*1024*1024)
/* Maximum number of non-blocking sends kept in flight at any one time. */
#define NUM_SEND_REC 32

/*
 * Debug tracing.  DBPRINT takes one argument that is itself a complete,
 * parenthesized printf argument list, e.g. DBPRINT(("x=%d\n", x)).
 * As written the macro expands to nothing; change the "#if 0" below to
 * "#if 1" to route the traces to printf.
 */
#if 0
#define DBPRINT(S) printf S
#else
#define DBPRINT(S)
#endif

/*
 * One entry of the send schedule: a destination rank tagged with a
 * sort key.  Sorting the entries by key yields the order in which the
 * destinations are visited.
 */
struct sortitem
{
    int key;    /* sort key compared by comp_sortitem() */
    int rank;   /* destination MPI rank for this send */
};

/*
 * qsort() comparison callback for arrays of struct sortitem.
 *
 * Orders entries by ascending key and returns -1, 0 or 1 when the first
 * entry's key is respectively smaller than, equal to, or larger than the
 * second entry's key.  The rank field does not take part in the ordering.
 */
int comp_sortitem(const void *, const void *);
int
comp_sortitem(
  const void *v1,
  const void *v2)
{
  const struct sortitem *lhs = v1;
  const struct sortitem *rhs = v2;

  if (lhs->key < rhs->key) {
    return -1;
  }
  if (lhs->key > rhs->key) {
    return 1;
  }
  return 0;
}

main(int argc, char **argv)
{
  int np, myid;
  int msglen;
  int msgcnt;
  MPI_Status *sendstatus;
  int *sendindex;
  int num_send_complete;
  struct sortitem *ranklist;
  char **buf;
  char **recvbuf;
  MPI_Status *recvstatus;
  int *recvindex;
  MPI_Request *recvreq;
  int num_recv_complete;
  int total_recv;
  int total_send;
  int sends_to_do;
  MPI_Request *sendreq;
  int flag;
  int nrecvbuf;
  char *txbuf;
  int send_pending;
  int max_send_pending;
  int iter;
  int i;
  int j;
  int c;
  double total_bytes;
  double start,end;
  extern char *optarg;

  msglen = DFLT_MSGLEN;
  msgcnt = DFLT_MSGCNT;

  MPI_Init(&argc,&argv);

  while ((c = getopt(argc, argv, "n:l:")) != EOF) switch (c) {
  case 'n':
    msgcnt = atoi(optarg);
    break;
  case 'l':
    msglen = atoi(optarg);
    break;
  }

  MPI_Comm_size(MPI_COMM_WORLD,&np);
  MPI_Comm_rank(MPI_COMM_WORLD,&myid);

  /* allocate send related stuff */
  sends_to_do = (np-1)*msgcnt;
  max_send_pending = NUM_SEND_REC;
  if (max_send_pending > sends_to_do) max_send_pending = sends_to_do;

  txbuf = malloc(msglen);
  sendreq =  (MPI_Request *) calloc(sizeof(MPI_Request), max_send_pending);
  sendindex = (int *) calloc(sizeof(int), max_send_pending);
  sendstatus = (MPI_Status *) calloc(sizeof(MPI_Status), max_send_pending);
  assert(txbuf != NULL);
  assert(sendreq != NULL);
  assert(sendindex != NULL);
  assert(sendstatus != NULL);

  /* pick a good number for recvbuf */
  nrecvbuf = sqrt(np*msgcnt);
  if (nrecvbuf < 4) nrecvbuf = 4;
  if (nrecvbuf * msglen > MAX_RECVBUF_BYTES) {
    nrecvbuf = MAX_RECVBUF_BYTES / msglen;
  }

  /* allocate and start a few receives to keep everyone somewhat happy */
  recvbuf = (char **) malloc(nrecvbuf * sizeof(char *));
  recvreq = (MPI_Request *) calloc(sizeof(MPI_Request), nrecvbuf);
  recvindex = (int *) calloc(sizeof(int), nrecvbuf);
  recvstatus = (MPI_Status *) calloc(sizeof(MPI_Status), nrecvbuf);

  assert(recvbuf != NULL);
  assert(recvreq != NULL);
  assert(recvindex != NULL);
  assert(recvstatus != NULL);
  DBPRINT(("alloc %d bufs\n", nrecvbuf));

  for (i=0; i<nrecvbuf; ++i) {
    recvbuf[i] = malloc(msglen);
    if (recvbuf[i] == NULL) {
      perror("malloc recvbuf");
      MPI_Abort(MPI_COMM_WORLD, 1);
      exit(1);
    }

    MPI_Irecv(recvbuf[i], msglen, MPI_BYTE, MPI_ANY_SOURCE, 0,
	MPI_COMM_WORLD, &recvreq[i]);
  }
  DBPRINT(("gen list\n"));

  /*
   * generate a randonly ordered list of the hosts*iterations
   */
  ranklist = (struct sortitem *)
	malloc(sizeof(struct sortitem) * np * msgcnt);
  assert(ranklist != 0);
  for (iter=0; iter<np; ++iter) {
    if (iter == myid) continue;
    if (iter > myid) i = iter-1;
    else i=iter;
    for (j=0; j<msgcnt; ++j) {
      ranklist[i*msgcnt + j].key = random();
      ranklist[i*msgcnt + j].rank = iter;
    }
  }
  DBPRINT(("sorting list id=%d\n", myid));


  /* randomize the host order */
  qsort(ranklist, sends_to_do, sizeof(*ranklist), comp_sortitem);

#if 0
  if (myid == 0) {
    printf("host order:\n");
    for (i=0; i<sends_to_do; ++i) {
      printf("\t%d: %d %d\n", i, ranklist[i].rank, ranklist[i].key);
    }
  }
#endif

  /* init things and start the timer */
  send_pending = 0;
  total_recv = 0;
  total_send = 0;

  start = MPI_Wtime();

  /* Put sends in flight */
  while (send_pending < max_send_pending) {
    MPI_Isend(txbuf, msglen, MPI_BYTE, ranklist[total_send].rank, 0, 
	MPI_COMM_WORLD, &sendreq[total_send]);
    ++total_send;
    ++send_pending;
  }

  /* Loop until all sends are done */
  end = MPI_Wtime();
  while (send_pending > 0) {

    if (myid == 0 && (MPI_Wtime() - end > 10)) {
      printf("[%d] send %d/%d\n", myid, total_send, sends_to_do);
      end = MPI_Wtime();
    }

    DBPRINT(("[%d] test %d recvs\n", myid, nrecvbuf));
    MPI_Testsome(nrecvbuf, recvreq,
	&num_recv_complete, recvindex, recvstatus);
    DBPRINT(("[%d] found %d recvs\n", myid, num_recv_complete));
    total_recv += num_recv_complete;
    for (i=0; i<num_recv_complete; ++i) {
      j = recvindex[i];
      DBPRINT(("[%d] recv %d complete, src=%d, err=%d\n", myid, j, recvstatus[j].MPI_SOURCE, recvstatus[j].MPI_ERROR));
      MPI_Irecv(recvbuf[j], msglen, MPI_BYTE, MPI_ANY_SOURCE, 0,
	MPI_COMM_WORLD, &recvreq[j]);
    }

    DBPRINT(("[%d] test %d sends\n", myid, max_send_pending));

    MPI_Testsome(max_send_pending, sendreq,
	&num_send_complete, sendindex, sendstatus);
    DBPRINT(("[%d] found %d sends\n", myid, num_send_complete));
    for (i=0; i<num_send_complete; ++i) {
      if (total_send < sends_to_do) {
	j = sendindex[i];
    DBPRINT(("[%d] send %d complete, err=%d\n", myid, j, recvstatus[j].MPI_ERROR));
    DBPRINT(("[%d] send[%d] to %d\n", myid, j, ranklist[total_send].rank));
	MPI_Isend(txbuf, msglen, MPI_BYTE, ranklist[total_send].rank, 0, 
	    MPI_COMM_WORLD, &sendreq[j]);
	++total_send;
      } else {
        --send_pending;
      }
    }
  }

printf("[%d] all sends complete\n", myid);


  /* wait for all receives to complete */
  while (total_recv < sends_to_do) {
    MPI_Waitsome(nrecvbuf, recvreq, &num_recv_complete, recvindex, recvstatus);
    total_recv += num_recv_complete;
    for (i=0; i<num_recv_complete; ++i) {
      j = recvindex[i];
    DBPRINT(("[%d] recv2 %d(%d) complete, src=%d, err=%d\n", myid, j, total_recv,recvstatus[j].MPI_SOURCE, recvstatus[j].MPI_ERROR));
      MPI_Irecv(recvbuf[j], msglen, MPI_BYTE, MPI_ANY_SOURCE, 0,
	MPI_COMM_WORLD, &recvreq[j]);
    }
  } 
  end = MPI_Wtime();

  total_bytes = (double)msglen * total_send * 2;
  printf("%.2f seconds elapsed, BW = %f MB/s\n", end-start,
  	total_bytes/1024/1024/(end-start));

  /* cancel all pending receives here, but i am lazy... */

  MPI_Finalize();

  return 0;
}
